{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.io import loadmat\n",
    "from sklearn.preprocessing import PolynomialFeatures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pylab import mpl\n",
    "\n",
    "mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体\n",
    "mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def h(theta, x):\n",
    "    \"\"\"预测函数\n",
    "\n",
    "    Args:\n",
    "        theta 模型参数\n",
    "        x 特征向量\n",
    "\n",
    "    Returns:\n",
    "        预测结果\n",
    "    \"\"\"\n",
    "    return (theta.T * x)[0, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def J(theta, X, y, theLambda=0):\n",
    "    \"\"\"代价函数\n",
    "\n",
    "    Args:\n",
    "        theta 模型参数\n",
    "        X 样本特征\n",
    "        y 样本标签\n",
    "\n",
    "    Returns:\n",
    "        预测误差（代价）\n",
    "    \"\"\"\n",
    "    m = len(X)\n",
    "    return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta)) / (2*m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gradient(X, y, alpha=1, maxLoop=50, epsilon=1e-5, theLambda=0, initTheta=None):\n",
    "    \"\"\"批量梯度下降法\n",
    "\n",
    "    Args:\n",
    "        X 样本特征\n",
    "        y 样本标签\n",
    "        alpha 学习率\n",
    "        maxLoop 最大迭代次数\n",
    "        epsilon 收敛精度\n",
    "        theLambda 正则化参数\n",
    "    Returns:\n",
    "        theta, errors\n",
    "    \"\"\"\n",
    "    m, n = X.shape\n",
    "    # 初始化theta\n",
    "    if initTheta is None:\n",
    "        theta = np.zeros((n, 1))\n",
    "    else:\n",
    "        theta = initTheta\n",
    "    count = 0\n",
    "    \n",
    "    error = float('inf')\n",
    "    errors = [error,]\n",
    "    for i in range(maxLoop):\n",
    "        theta = theta + (1.0 / m) * alpha * ((y - X * theta).T * X).T\n",
    "        error = J(theta, X, y, theLambda)\n",
    "        if np.isnan(error):\n",
    "            error = np.inf\n",
    "        \n",
    "        errors.append(error)\n",
    "        # 如果已经收敛\n",
    "        if(abs(errors[-1]-errors[-2]) < epsilon):\n",
    "            break\n",
    "    print 'iterating',i\n",
    "    return theta, errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def normalize(X):\n",
    "    \"\"\"特征归一化处理\n",
    "\n",
    "    Args:\n",
    "        X 样本集\n",
    "    Returns:\n",
    "        归一化后的样本集\n",
    "    \"\"\"\n",
    "    m, n = X.shape\n",
    "    # 归一化每一个特征\n",
    "    for j in range(n):\n",
    "        features = X[:,j]\n",
    "        minVal = features.min(axis=0)\n",
    "        maxVal = features.max(axis=0)\n",
    "        diff = maxVal - minVal\n",
    "        if diff != 0:\n",
    "           X[:,j] = (features-minVal)/diff\n",
    "        else:\n",
    "           X[:,j] = 0\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = loadmat('data/water.mat')\n",
    "#####\n",
    "# 数据集划分\n",
    "#####\n",
    "# 训练集\n",
    "X = np.mat(data['X'])\n",
    "# 添加偏置\n",
    "X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)\n",
    "y = np.mat(data['y'])\n",
    "# 交叉验证集\n",
    "Xval = np.mat(data['Xval'])\n",
    "Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1)\n",
    "yval = np.mat(data['yval'])\n",
    "# 测试集\n",
    "Xtest = np.mat(data['Xtest'])\n",
    "Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1)\n",
    "ytest = np.mat(data['ytest'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def diagnoseLR():\n",
    "    \"\"\"线性回归诊断\n",
    "    \"\"\"\n",
    "    initTheta = np.mat(np.ones((X.shape[1], 1)))\n",
    "    theta, errors = gradient(\n",
    "        X, y, alpha=0.001, maxLoop=5000, epsilon=0.00001, initTheta=initTheta)\n",
    "\n",
    "    # 绘制拟合成果\n",
    "    Xmin = X[:, 1].min()\n",
    "    Xmax = X[:, 1].max()\n",
    "    ymax = y[:, 0].max()\n",
    "    ymin = y[:, 0].min()\n",
    "    fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1))\n",
    "    fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)\n",
    "\n",
    "    h = fitX * theta\n",
    "    plt.xlim(Xmin, Xmax)\n",
    "    plt.ylim(ymin, ymax)\n",
    "    # 绘制训练样本\n",
    "    plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0],marker='x',color='r', linewidth=2)\n",
    "    # 绘制拟合曲线\n",
    "    plt.plot(fitX[:, 1], h, color='b')\n",
    "    plt.xlabel(u'水位变化(x)')\n",
    "    plt.ylabel(u'大坝流量(y)')\n",
    "    plt.show()\n",
    "\n",
    "    # 绘制随样本规模学习曲线\n",
    "    m, n = X.shape\n",
    "    trainErrors = np.zeros((1,m))\n",
    "    valErrors = np.zeros((1,m))\n",
    "    for i in range(m):\n",
    "        Xtrain = X[0:i+1]\n",
    "        ytrain = y[0:i+1]\n",
    "        # 注意，这里我们没有设置theLambda，实际上没有必要\n",
    "        theta, errors = gradient(\n",
    "            Xtrain, ytrain, alpha=0.001, maxLoop=10000, epsilon=0.00001)\n",
    "        \n",
    "        trainErrors[0,i] = J(theta, Xtrain, ytrain)\n",
    "        valErrors[0,i] = J(theta, Xval, yval)\n",
    "\n",
    "    print u'最小交叉验证误差', valErrors.ravel()[-1]\n",
    "    plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label=u'测试误差')\n",
    "    plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label=u'交叉验证误差')\n",
    "    plt.title(u'线性回归学习曲线')\n",
    "    plt.xlabel(u'训练样本量')\n",
    "    plt.ylabel(u'误差')\n",
    "    plt.legend()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diagnoseLR()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def diagnosePR():\n",
    "    \"\"\"多项式回归诊断\n",
    "    \"\"\"\n",
    "    # 多项式回归\n",
    "    poly = PolynomialFeatures(degree=10)\n",
    "    XX, XXval, XXtest = [normalize(\n",
    "        np.mat(poly.fit_transform(data[:, 1:]))) for data in [X, Xval, Xtest]]\n",
    "    initTheta = np.mat(np.ones((XX.shape[1], 1)))\n",
    "    theLambdas = [0, 0.01, 0.02, 0.03, 0.04, 0.05,0.06,0.07, 0.08, 0.09,0.10, 0.11, 0.12,0.13,0.14,0.15,0.2,0.3,0.5]\n",
    "    numTheLambdas = len(theLambdas)\n",
    "    trainErrors = np.zeros((1, numTheLambdas))\n",
    "    valErrors = np.zeros((1, numTheLambdas))\n",
    "    thetas = []\n",
    "    for idx, theLambda in enumerate(theLambdas):\n",
    "        theta, errors = gradient(\n",
    "            XX, y, alpha=0.1, maxLoop=10000, epsilon=0.0001,\n",
    "            theLambda=theLambda, initTheta=initTheta)\n",
    "        \n",
    "        thetas.append(theta)\n",
    "        # 训练误差、交叉验证误差，不需要考虑模型复杂度，所以theLambda不需要设置或者设置成0\n",
    "        trainErrors[0, idx] = J(theta,XX,y)\n",
    "        valErrors[0, idx] = J(theta, XXval, yval)\n",
    "        print theLambda, valErrors[0,idx]\n",
    "\n",
    "    bestLambda = theLambdas[np.argmin(valErrors)]\n",
    "    theta = thetas[np.argmin(valErrors)]\n",
    "    error = np.min(valErrors)\n",
    "\n",
    "    # # 选择lambda\n",
    "    plt.plot(theLambdas, trainErrors.ravel(), color='b',label=u'训练误差')\n",
    "    plt.plot(theLambdas, valErrors.ravel(), color='g',label=u'交叉验证误差')\n",
    "    plt.title(u'多项式回归选择λ')\n",
    "    plt.xlabel(u'λ')\n",
    "    plt.ylabel(u'误差')\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "    # 绘制拟合曲线\n",
    "    fitX = np.mat(np.linspace(-60, 45).reshape(-1, 1))\n",
    "    fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)\n",
    "    fitXX = normalize(np.mat(poly.fit_transform(fitX[:, 1:])))\n",
    "    h = fitXX * theta\n",
    "    print theta\n",
    "    plt.title(u'多项式回归拟合曲线(lambda=%.3f) \\n  交叉验证误差=%.3f' % (bestLambda, error))\n",
    "    \n",
    "    plt.scatter(X[:, 1].tolist(), y[:, 0].tolist(), marker='x', color='r', linewidth=3)\n",
    "    plt.plot(fitX[:, 1], h, color='b')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diagnosePR()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
